#libraries
library(readr)
library(tidyverse)
library(dplyr)
library(ggmap)
library(ggplot2)
library(ggpubr)
arrests <- read_csv("https://raw.githubusercontent.com/paulmj7/STOR320_Group0_Project/master/arrests.csv")
head(arrests)
## # A tibble: 6 x 22
## X1 id `Primary Charge` Street Street1 Street2 City State Zipcode
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 1 40515 ASSAULT-SIMPLE 117 O~ OLD DU~ <NA> CHAP~ NC 27517
## 2 2 40498 FAIL TO APPEAR/~ 214 W~ WEST R~ <NA> CHAP~ NC 27516
## 3 3 40496 PUBLIC URINATION 107 E~ EPHESU~ LEGION~ CHAP~ NC 27517
## 4 4 40497 AFFRAY/ASSAULT ~ 117 O~ OLD DU~ <NA> CHAP~ NC 27517
## 5 5 40495 ASSAULT-SIMPLE 233 M~ MCCAUL~ <NA> CHAP~ NC 27514
## 6 6 40492 POSS MARIJUANA ~ 143 W~ WEST F~ CHURCH~ CHAP~ NC 27516
## # ... with 13 more variables: `Date of Arrest` <date>, `Time of Arrest` <time>,
## # Age <dbl>, Race <chr>, Gender <chr>, Ethnicity <chr>, `Type of
## # Arrest` <chr>, `Drugs or Alcohol Present` <chr>, `Weapon Present` <chr>,
## # Disposition <chr>, latitude_longitude <chr>, latitude <dbl>,
## # longitude <dbl>
#cleaning code to sort arrests into latlong group and save grouping lines
arrests <- filter(arrests, longitude < -78.95 & longitude >-79.11 & latitude > 35.85 & latitude < 35.98)
arrests <- within(arrests, {
grp.lat = cut(latitude, 10, labels = FALSE)
grp.long = cut(longitude, 10, labels = FALSE)
})
vec_group_long <- vector()
for (i in 1:10) {
vec_group_long[i] <- min(arrests$longitude[arrests$grp.long==i])
}
vec_group_long[11] <- -78.95
vec_group_lat <- vector()
for (i in 1:10) {
vec_group_lat[i] <- min(arrests$latitude[arrests$grp.lat==i])
}
vec_group_lat[11] <- 35.98
Graphs that may be useful
arrests$weekday <- weekdays(arrests$`Date of Arrest`)
arrests$weekday <- factor(arrests$weekday, level = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
x <- group_by(arrests, weekday) %>%
summarise(count = n()) %>%
ggplot(aes(x=weekday, y=count)) + geom_col(fill='steel blue') + theme_minimal() +
labs(x="Weekday", y="Number of Arrests", title = "Number of Arrests by Weekday")
arrests$month <- months(arrests$'Date of Arrest')
arrests$month <- factor(arrests$month, level = c("January", "February", "March", "April", "May","June", "July", "August", "September", "October", "November", "December"))
y <- group_by(arrests, month) %>%
summarise(count = n()) %>%
ggplot(aes(x=month, y=count)) + geom_col(fill='steel blue') +
theme_minimal() + theme(axis.text.x =element_text(angle = 45)) +
xlab("Month") + ylab("Number of Arrests") + labs(title="Number of Arrests by Month from Dataset")
figure <- ggarrange(x, y,
ncol = 1, nrow = 2)
figure

#survey <- read_csv("C:/Users/gabri/OneDrive/Documents/Junior Year/Spring '20/STOR 320/Project/cleaned_survey.csv")
survey <- read_delim("https://raw.githubusercontent.com/paulmj7/STOR320_Group0_Project/master/q5-public-safety-police-services.csv", ";", escape_double = FALSE, trim_ws = TRUE)
head(survey)
## # A tibble: 6 x 13
## id `Q5-5. Overall ~ `Q5-6. The visi~ `Q5-7. The Town~ `Q5-8. How quic~
## <dbl> <chr> <chr> <chr> <chr>
## 1 128 Very Satisfied Very Satisfied Very Satisfied Very Satisfied
## 2 169 Very Satisfied Very Satisfied Very Satisfied Very Satisfied
## 3 243 Very Satisfied Satisfied Satisfied Don't Know
## 4 308 Very Satisfied Very Satisfied Very Satisfied Very Satisfied
## 5 338 Neutral Neutral Satisfied Don't Know
## 6 379 Very Satisfied Very Satisfied Very Satisfied Very Satisfied
## # ... with 8 more variables: `Q5-9. Enforcement of local traffic laws` <chr>,
## # `Q5-10. Police safety education programs` <chr>, `Q5-11. Chapel Hill Police
## # Department's overall performance` <chr>, `Q5-12. The attitude and behavior
## # of Police Department personnel toward residents` <chr>, `Q5-13. The level
## # of safety and security in your neighborhood` <chr>, LATITUDE <dbl>,
## # LONGITUDE <dbl>, location <chr>
#cleaning code for survey data
survey <-rename(survey,
Protection_Quality = 'Q5-5. Overall quality of local police protection',
Visibility = 'Q5-6. The visibility of police in neighborhoods',
Prevention = "Q5-7. The Town's efforts to prevent crime",
Reponse = "Q5-8. How quickly police respond to emergencies",
Traffic_Enforcement = "Q5-9. Enforcement of local traffic laws",
Education_Programs = "Q5-10. Police safety education programs",
Overall_Performance = "Q5-11. Chapel Hill Police Department's overall performance",
Attitude = "Q5-12. The attitude and behavior of Police Department personnel toward residents",
Safety = "Q5-13. The level of safety and security in your neighborhood")
survey2 <- survey
for ( i in 2:10) {
survey2[i] <- sapply(survey2[i], function(x) as.integer(factor(as.character(x), levels=c("Very Dissatisfied", "Dissatisfied", "Neutral", "Satisfied", "Very Satisfied"))))
}
#functions to use same groups created for arrest data for satisfaction data
assign_latgroup <- function (x) {
for ( i in 1:10) {
if (vec_group_lat[i] <= x & x < vec_group_lat[i+1]) {
return (i)
}
}
return (NA)
}
assign_longgroup <- function(x) {
for ( i in 1:10) {
if (vec_group_long[i] <= x & x < vec_group_long[i+1]) {
return(i)
}
}
return(NA)
}
#create groups for survey data
for (i in 1:nrow(survey2)) {
survey2$grp.lat[i] <- assign_latgroup(survey2$LATITUDE[i])
}
for( i in 1:nrow(survey2)) {
survey2$grp.long[i] <- assign_longgroup((survey2$LONGITUDE[i]))
}
survey <- survey2
Survey data
#map of surveys collected with latitude and longitude grouping lines
# can add facet = 'Overall_Performance' to get maps of answers
library(ggmap)
register_google(key = "AIzaSyAz_F8FVVhZbjfsz4rCz4cEigy6__5cQKc")
survey2 <- survey %>%
mutate_at(vars(Overall_Performance), funs(factor))
qmplot(LONGITUDE, LATITUDE, data=survey2, maptype='toner-lite', color = Overall_Performance,
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8) +
geom_hline(yintercept = vec_group_lat) +
geom_vline(xintercept = vec_group_long) +
ggtitle("Location of Surveys by Overall Performance Score")

#count of latlong groups for survey data
x1 <- survey %>%
group_by(grp.lat, grp.long) %>%
summarise(count = n()) %>%
arrange(desc(count))
x2 <- survey %>%
group_by(grp.lat, grp.long) %>%
summarise(count = n()) %>%
ggplot(aes(x=grp.long, y=grp.lat, fill = count)) +
geom_tile() +
scale_fill_viridis_c() +
theme_classic() +
xlim(.5, 10.5) +
scale_y_continuous(breaks = seq(1, 10, by = 1)) +
scale_x_continuous( breaks = seq(1, 10, by = 1)) +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(size=8),
legend.title = element_text(size=8),
axis.title = element_text(size=9)) +
geom_vline(xintercept = seq(.5, 10.5, 1)) +
geom_hline(yintercept = seq(.5, 10.5, 1)) +
ggtitle("Number of Surveys Collected by LatLong Grouping") +
xlab("Longitude Group") +
ylab("Latitude Group")
#Average Overall Preformance score by latlong groups
y1 <- survey %>%
filter(!is.na(Overall_Performance)) %>%
group_by(grp.lat, grp.long) %>%
summarise(satisfaction = mean(Overall_Performance, na.rm=TRUE))
y2 <- survey %>%
filter(!is.na(Overall_Performance)) %>%
group_by(grp.lat, grp.long) %>%
summarise(satisfaction = mean(Overall_Performance, na.rm=TRUE)) %>%
ggplot(aes(x=grp.long, y=grp.lat, fill = satisfaction)) +
geom_tile() +
scale_fill_viridis_c() +
theme_classic() +
xlim(.5, 10.5) +
scale_y_continuous(breaks = seq(1, 10, by = 1)) +
scale_x_continuous( breaks = seq(1, 10, by = 1)) +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(size=8),
legend.title = element_text(size=8),
axis.title = element_text(size=9)) +
geom_vline(xintercept = seq(.5, 10.5, 1)) +
geom_hline(yintercept = seq(.5, 10.5, 1)) +
ggtitle("Average Overall Performance score by LatLong Grouping") +
xlab("Longitude Group") +
ylab("Latitude Group")
# average of each person's answers to all questions, grouped by latlong group and then averaged
z1 <- survey %>%
mutate(average_score = rowMeans(survey[, 2:10], na.rm= TRUE)) %>%
na.omit(average_score) %>%
group_by(grp.lat, grp.long) %>%
summarise(Overall_average = mean(average_score))
z2 <- survey %>%
mutate(average_score = rowMeans(survey[, 2:10], na.rm= TRUE)) %>%
na.omit(average_score) %>%
group_by(grp.lat, grp.long) %>%
summarise(Overall_average = mean(average_score)) %>%
ggplot(aes(x=grp.long, y=grp.lat, fill = Overall_average)) +
geom_tile() +
scale_fill_viridis_c() +
theme_classic() +
xlim(.5, 10.5) +
scale_y_continuous(breaks = seq(1, 10, by = 1)) +
scale_x_continuous( breaks = seq(1, 10, by = 1)) +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
plot.title = element_text(size=8),
legend.title = element_text(size=8),
axis.title = element_text(size=9)) +
geom_vline(xintercept = seq(.5, 10.5, 1)) +
geom_hline(yintercept = seq(.5, 10.5, 1)) +
ggtitle("Average of Survey Questions by LatLong Grouping") +
xlab("Longitude Group") +
ylab("Latitude Group")
full_join(x1, y1, by = c("grp.lat", "grp.long")) %>%
full_join(z1, by = c("grp.lat", "grp.long")) %>%
as.data.frame()
## grp.lat grp.long count satisfaction Overall_average
## 1 3 3 27 4.000000 3.820513
## 2 7 7 27 4.346154 4.097222
## 3 9 6 24 4.260870 4.464646
## 4 6 4 22 3.818182 3.791667
## 5 8 6 22 4.000000 3.777778
## 6 4 3 20 4.050000 4.255556
## 7 8 4 19 4.222222 4.349206
## 8 9 4 19 3.833333 4.088889
## 9 8 5 18 4.214286 4.236111
## 10 6 5 17 4.000000 3.730159
## 11 9 5 17 3.875000 3.944444
## 12 5 7 15 4.333333 4.800000
## 13 5 4 13 3.769231 3.577778
## 14 5 6 13 4.181818 4.111111
## 15 8 7 13 4.375000 4.555556
## 16 6 3 12 4.200000 4.083333
## 17 6 7 12 3.818182 3.600000
## 18 7 5 12 4.166667 4.138889
## 19 6 6 11 4.090909 4.222222
## 20 7 3 10 4.000000 3.861111
## 21 7 6 9 4.142857 4.277778
## 22 4 4 8 3.857143 4.055556
## 23 9 3 7 4.142857 3.955556
## 24 4 5 6 4.333333 4.333333
## 25 4 7 5 3.400000 3.833333
## 26 7 4 5 3.250000 3.814815
## 27 6 8 4 4.000000 4.555556
## 28 8 3 4 4.250000 4.000000
## 29 10 6 4 4.500000 4.055556
## 30 5 5 3 4.500000 3.777778
## 31 9 7 3 2.500000 2.722222
## 32 10 4 3 5.000000 4.444444
## 33 3 5 2 4.000000 5.000000
## 34 5 3 2 4.000000 5.000000
## 35 10 3 2 4.000000 4.000000
## 36 7 8 1 5.000000 4.888889
figure <- ggarrange(x2, y2, z2,
nrow = 2, ncol = 2)
figure

#grouped average score for answered questions
survey_satisfaction <- survey %>%
mutate(average_score = rowMeans(survey[, 2:10], na.rm= TRUE)) %>%
na.omit(average_score) %>%
group_by(grp.lat, grp.long) %>%
summarise(Overall_average = mean(average_score),
Surveys = n())
#grouped number of arrests by latlong
arrests2 <- arrests %>%
group_by(grp.lat, grp.long) %>%
summarise(num_arrests = n())
#combined grouped latlongs with number of arrests and average survey scores
combined <- left_join(survey_satisfaction, arrests2, by = c("grp.lat", "grp.long")) %>%
mutate_at(vars(num_arrests),~replace(., is.na(.), 0) )
combined
## # A tibble: 36 x 5
## # Groups: grp.lat [8]
## grp.lat grp.long Overall_average Surveys num_arrests
## <int> <int> <dbl> <int> <dbl>
## 1 3 3 3.82 13 99
## 2 3 5 5 1 4
## 3 4 3 4.26 10 209
## 4 4 4 4.06 4 224
## 5 4 5 4.33 4 85
## 6 4 7 3.83 2 49
## 7 5 3 5 1 718
## 8 5 4 3.58 5 5546
## 9 5 5 3.78 1 91
## 10 5 6 4.11 5 232
## # ... with 26 more rows
ggplot(combined, aes(x=Overall_average, y=num_arrests)) +
geom_point(color='steel blue') +
ggtitle("Average Score by Number of Arrests in LatLong Group") +
ylab("Number of Arrests in LatLong")+
xlab("Average Police Score in LatLong") +
theme_minimal()

#filter(combined, !(grp.lat == 9 & grp.long == 7)) %>%
# ggplot(aes(x=Overall_average, y=num_arrests)) +
#geom_point(color='steel blue') +
#ggtitle("Average Score by Number of Arrests in LatLong Group") +
#ylab("Number of Arrests in LatLong")+
#xlab("Average Police Score in LatLong") +
#theme_minimal()
Arrest Data
qmplot(longitude, latitude, data=arrests, geom = "blank",
zoom = 13, maptype = "toner-background",legend = "topright") +
stat_density_2d(aes(fill = ..level..), size=0.01, bins=80, geom = "polygon", alpha = .3, color = NA) +
scale_fill_viridis_c()+
guides(fill = guide_colorbar(barwidth = 1.5, barheight = 10)) +
ggtitle("Density Map of Arrests")

t <- arrests %>%
filter(grepl("DWI", arrests$`Primary Charge`, ignore.case = TRUE))
w <-qmplot(longitude, latitude, data = t, maptype='toner-lite',color=I("dark blue"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8, zoom=12) +
geom_hline(yintercept = vec_group_lat, alpha = .5) +
geom_vline(xintercept = vec_group_long, alpha = .5) +
ggtitle("DWI Arrests")
t <- arrests %>%
filter(grepl("OPEN", arrests$`Primary Charge`, ignore.case = TRUE))
x <-qmplot(longitude, latitude, data = t, maptype='toner-lite',color=I("red"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8, zoom = 12) +
geom_hline(yintercept = vec_group_lat, alpha = .5) +
geom_vline(xintercept = vec_group_long, alpha = .5) +
ggtitle("Open Container Arrests")
t <- arrests %>%
filter(grepl("APPEAR", arrests$`Primary Charge`, ignore.case = TRUE))
y <-qmplot(longitude, latitude, data = t, maptype='toner-lite',color=I("orange"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8, zoom = 12) +
geom_hline(yintercept = vec_group_lat, alpha = .5) +
geom_vline(xintercept = vec_group_long, alpha = .5) +
ggtitle("Failure to Appear Arrests")
t <- arrests %>%
filter(grepl("ASSAULT", arrests$`Primary Charge`, ignore.case = TRUE))
z <-qmplot(longitude, latitude, data = t, zoom = 12, maptype='toner-lite',color=I("purple"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8) +
geom_hline(yintercept = vec_group_lat, alpha = .5) +
geom_vline(xintercept = vec_group_long, alpha = .5) +
ggtitle("Assault Arrests")
library(ggpubr)
figure <- ggarrange(w, x, y, z,
ncol = 2, nrow = 2)
figure

t <- arrests %>%
filter(grepl("MARIJUANA", arrests$`Primary Charge`, ignore.case = TRUE))
x <- qmplot(longitude, latitude, data = t, maptype='toner-lite',color=I("dark green"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8) +
geom_hline(yintercept = vec_group_lat, alpha=.5) +
geom_vline(xintercept = vec_group_long, alpha=.5) +
ggtitle("Marijuana Arrests")
t <- arrests %>%
filter(grepl("LARCENY", arrests$`Primary Charge`, ignore.case = TRUE))
y <- qmplot(longitude, latitude, data = t, maptype='toner-lite',color=I("sky blue"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8) +
geom_hline(yintercept = vec_group_lat, alpha=.5) +
geom_vline(xintercept = vec_group_long, alpha=.5) +
ggtitle("Larceny Arrests")
t <- arrests %>%
filter(grepl("SHOP", arrests$`Primary Charge`, ignore.case = TRUE))
z <- qmplot(longitude, latitude, data = t, maptype='toner-lite',color=I("magenta"),
xlim= c(vec_group_long[1], vec_group_long[11]),
ylim= c(vec_group_lat[1], vec_group_lat[11]), f=.8) +
geom_hline(yintercept = vec_group_lat, alpha=.5) +
geom_vline(xintercept = vec_group_long, alpha=.5) +
ggtitle("Shoplifting Arrests")
figure <- ggarrange(x, y, z,
nrow = 2, ncol = 2)
figure

arrest2 <- arrests %>%
select(latitude:longitude) %>%
mutate(latitude = pi*latitude/180, longitude = pi*longitude/180)
haversine <- function(lat1, lat2, long1, long2){
# function written for transparency in matching with haversine formula coordinates must be in radians
h <- sin(.5 * (lat2 - lat1)) ^ 2 + cos(lat1) * cos(lat2) * (sin(.5 * (long2 - long1)) ^ 2)
# earth diameter taken from http://imagine.gsfc.nasa.gov/features/cosmic/earth_info.html in miles
r <- (12756 / 2) * 0.621371
return(2 * r * asin(sqrt(h)))
}
d <- function(x, y){
haversine(x[1], y[1], x[2], y[2])
}
library(proxy)
#Dmat <- proxy::dist(arrest2, method = d)
library(cluster)
K <- 5
set.seed(1305)
#m <- pam(Dmat, k = K, diss = TRUE)
load("pamobject_arrests.Rdata")
arrest2 <- mutate(arrest2, cluster = factor(m$cluster))
group_by(arrest2, cluster) %>%
summarise(n = n(), latitude = mean(latitude),
longitude = mean(longitude))
## # A tibble: 5 x 4
## cluster n latitude longitude
## <fct> <int> <dbl> <dbl>
## 1 1 3217 0.627 -1.38
## 2 2 5796 0.627 -1.38
## 3 3 3560 0.627 -1.38
## 4 4 1675 0.628 -1.38
## 5 5 1835 0.627 -1.38
library(ggmap)
register_google(key = "AIzaSyAz_F8FVVhZbjfsz4rCz4cEigy6__5cQKc")
x <- mutate(arrest2, latitude = 180*latitude/pi, longitude = 180*longitude/pi) %>%
filter(longitude < -78.95 & longitude >-79.11 & latitude > 35.85 & latitude < 35.98)
qmplot(longitude, latitude, data=x, maptype='toner-lite',color=cluster)

a <- qmplot(longitude, latitude, data=arrests, maptype='toner-lite',color=I("red"), zoom =13) + geom_hline(yintercept = vec_group_lat, alpha=.5) +geom_vline(xintercept = vec_group_long, alpha=.5) + ggtitle("Arrests Divided into 100 Sectors")
x2 <- arrests %>%
group_by(grp.lat, grp.long) %>%
summarise(count=n())
b <- ggplot(x2, aes(x=grp.long, y=grp.lat, fill = count)) +
geom_tile() +
scale_fill_viridis_c() +
theme_classic() +
xlim(.5, 10.5) +
scale_y_continuous(breaks = seq(1, 10, by = 1)) +
scale_x_continuous( breaks = seq(1, 10, by = 1)) +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank()) +
geom_vline(xintercept = seq(.5, 10.5, 1)) +
geom_hline(yintercept = seq(.5, 10.5, 1)) +
ggtitle("Arrests Per LatLong Group") +
xlab("Longitude Group") +
ylab("Latitude Group")
x2
## # A tibble: 68 x 3
## # Groups: grp.lat [10]
## grp.lat grp.long count
## <int> <int> <int>
## 1 1 2 16
## 2 1 3 2
## 3 2 2 2
## 4 2 3 13
## 5 2 5 3
## 6 3 2 7
## 7 3 3 99
## 8 3 4 39
## 9 3 5 4
## 10 4 2 5
## # ... with 58 more rows
figure <-ggarrange(a,b,nrow=1, ncol=2, widths = c(15, 25))
figure
